library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(arm)
## Loading required package: MASS
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:dplyr':
##
## select
##
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
## Loading required package: lme4
##
## arm (Version 1.14-4, built: 2024-4-1)
##
## Working directory is /Users/Holden/Library/CloudStorage/Dropbox/My Mac (Holden’s MacBook Pro)/Desktop/dev/school/ocn_683
library(VGAM)
## Loading required package: stats4
## Loading required package: splines
##
## Attaching package: 'VGAM'
##
## The following object is masked from 'package:arm':
##
## logit
library(MASS)
# Acartia (copepod) survival data: 91 beakers; `nx` is the number of
# survivors per beaker out of 25 starting individuals.
acartia <- read_csv('data/dam_acartia_survival_subset.csv')
## New names:
## Rows: 91 Columns: 11
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," dbl
## (11): ...1, Generation, Treatment, Temp, pH, Rep, Beak, time, nx, lx, ni
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
# Yellow tang survey data: 1373 rows; `count` is the per-survey fish count.
yellow_tang <- read_csv('data/yellowtang.csv')
## New names:
## Rows: 1373 Columns: 60
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (21): commonname, region_name, island, site, reef_zone, depth_bin, dive... dbl
## (36): ...1, X, latitude, longitude, sitevisitid, obs_year, photographer... lgl
## (1): other_type dttm (2): time, date_
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
#1. Histogram of the data - use discrete.histogram()
# histogram for nx, this is the data of interest
# nx = number of survivors per beaker (out of 25 starting individuals)
discrete.histogram(acartia$nx)
#2. mean and variance of number of survivors - mean = 17.38 - var = 58.11
mean(acartia$nx)
## [1] 17.38462
# NOTE: the observed variance is far larger than a binomial with this
# mean would predict (overdispersion) -- explored in the sections below
var(acartia$nx)
## [1] 58.10598
# What should the variance be, given the observed mean, if the distribution
# is binomial? With 25 trials per beaker, var = 25*p*(1-p) ≈ 5.30 — far below
# the observed 58.1. (The 19.28 figure comes from mistakenly using the number
# of beakers, 91, as the binomial trial count.)
# Expected binomial variance: var = (trials) * p * (1 - p)
# p, prob of success: each beaker starts with 25 individuals
p <- (mean(acartia$nx) / 25)
# n, number of observations (beakers) -- used below as the NUMBER OF
# SIMULATED DRAWS, not as the binomial trial count
n <- nrow(acartia)
# BUG FIX: the trial count for each beaker is 25 (individuals per beaker),
# not n = 91 (the number of beakers). Using n gave the misleading 19.28;
# the correct binomial expectation is ~5.30 -- far below the observed 58.1,
# which is the first sign of overdispersion. (The simulated binomial
# variances below, all near 5, confirm this.)
25 * p * (1 - p)
## [1] ~5.30
#3. rbinom() to simulate draws from dist with same prob of survival, same length
# here's how we do it once
random_bugs <- rbinom(n, 25, p)
# BUG FIX: the original piped the result of discrete.histogram() into
# mean() and var() (random_bugs %>% discrete.histogram() %>% mean() %>%
# var()), which returns NA with a warning because a plot object is not
# numeric. Plot first, then summarise the draws themselves.
discrete.histogram(random_bugs)
mean(random_bugs)
var(random_bugs)
# Repeat this 4 more times (5 simulated draws in total).
# Simulate 5 independent binomial draws; for each, plot a discrete
# histogram and report the sample mean and variance. The 1-second pause
# leaves time to inspect each plot before the next replaces it.
set.seed(123)
for (draw in seq_len(5)) {
  sim_counts <- rbinom(n, 25, p)
  # discrete histogram of this draw
  discrete.histogram(sim_counts)
  # summary statistics for this draw
  draw_mean <- mean(sim_counts)
  draw_var <- var(sim_counts)
  # report
  cat("\nIteration", draw, ":\n")
  cat("Mean:", draw_mean, "\n")
  cat("Variance:", draw_var, "\n\n")
  Sys.sleep(1)
}
##
## Iteration 1 :
## Mean: 17.36264
## Variance: 5.4337
##
## Iteration 2 :
## Mean: 17.38462
## Variance: 3.77265
##
## Iteration 3 :
## Mean: 17.26374
## Variance: 4.907448
##
## Iteration 4 :
## Mean: 17.49451
## Variance: 6.252747
##
## Iteration 5 :
## Mean: 17.47253
## Variance: 5.252015
# How and why do the simulated and observed data differ? The observed data are
# much more skewed — zero-inflated — and no zeros appear in the simulated
# draws. The observed distribution looks bimodal: a peak at 0, and much higher
# survival among beakers where copepods do survive (peaking around 19). There
# is some variation across simulated draws, and it is possible (though
# exceedingly unlikely) that the sample size is too low — if the observed data
# truly followed a binomial distribution, more replication might show greater
# similarity. Most likely, the observed data simply do not follow a binomial
# distribution; some way of handling the zero-inflation is needed.
#4. beta binomial - allows for prob of success to vary across trials
# using same n, p as above
# rho = 0 means no between-beaker variation in survival probability,
# so this reduces to a plain binomial draw
beta_random_bugs <- rbetabinom(n, 25, p, rho = 0)
# Make a loop: create histograms across the full possible range of rho (0 to 1).
set.seed(123)
rho_values <- seq(0, 1, length.out = 10) # generate 10 rho values from 0 to 1
# Simulate beta-binomial draws for each correlation parameter rho, plotting
# a discrete histogram and printing the sample mean and variance.
# Generalized so the hard-coded globals are now overridable defaults
# (backward compatible -- existing positional calls are unchanged):
#
#   rho_values: correlation parameters to sweep (rho = 0 is plain binomial;
#               larger rho means more between-beaker variation in p).
#   n_draws:    number of simulated beakers (defaults to global n).
#   size:       binomial trial count per beaker (defaults to 25).
#   prob:       survival probability (defaults to global p).
#   pause:      seconds to wait between plots so each can be inspected.
simulate_beta_binomial <- function(rho_values, n_draws = n, size = 25,
                                   prob = p, pause = 1) {
  for (rho in rho_values) {
    beta_random_bugs <- rbetabinom(n_draws, size, prob, rho)
    # plot discrete histogram
    discrete.histogram(beta_random_bugs, xlab = "Count",
                       main = paste("Histogram for rho =", round(rho, 2)))
    # calculate mean and variance
    mean_value <- mean(beta_random_bugs)
    var_value <- var(beta_random_bugs)
    # print results
    cat("\nRho =", round(rho, 2), ":\n")
    cat("Mean:", mean_value, "\n")
    cat("Variance:", var_value, "\n\n")
    Sys.sleep(pause)
  }
}
simulate_beta_binomial(rho_values)
##
## Rho = 0 :
## Mean: 17.36264
## Variance: 5.4337
##
## Rho = 0.11 :
## Mean: 17.9011
## Variance: 11.80122
##
## Rho = 0.22 :
## Mean: 17.20879
## Variance: 35.14481
##
## Rho = 0.33 :
## Mean: 17.52747
## Variance: 45.05201
##
## Rho = 0.44 :
## Mean: 17.20879
## Variance: 63.05592
##
## Rho = 0.56 :
## Mean: 17.49451
## Variance: 75.38608
##
## Rho = 0.67 :
## Mean: 18.10989
## Variance: 87.05446
##
## Rho = 0.78 :
## Mean: 18.31868
## Variance: 94.8862
##
## Rho = 0.89 :
## Mean: 18.78022
## Variance: 103.5956
##
## Rho = 1 :
## Mean: 14.56044
## Variance: 153.6935
# Visual inspection suggests rho between 0.3 and 1.0 is more promising; loop over that range here.
# Zoom the rho sweep in on [0.3, 1.0] (10 evenly spaced values).
rho_values <- seq(from = 0.3, to = 1, length.out = 10)
simulate_beta_binomial(rho_values)
##
## Rho = 0.3 :
## Mean: 17.83516
## Variance: 52.51697
##
## Rho = 0.38 :
## Mean: 18.02198
## Variance: 56.17729
##
## Rho = 0.46 :
## Mean: 15.86813
## Variance: 71.02686
##
## Rho = 0.53 :
## Mean: 16.03297
## Variance: 80.47668
##
## Rho = 0.61 :
## Mean: 18.42857
## Variance: 75.46984
##
## Rho = 0.69 :
## Mean: 16.49451
## Variance: 87.16386
##
## Rho = 0.77 :
## Mean: 16.47253
## Variance: 112.9631
##
## Rho = 0.84 :
## Mean: 15.69231
## Variance: 136.3043
##
## Rho = 0.92 :
## Mean: 17.35165
## Variance: 120.7416
##
## Rho = 1 :
## Mean: 11.26374
## Variance: 156.4408
# Refine again: focus on rho between 0.53 and 0.84.
# Narrow the rho sweep to [0.53, 0.84].
rho_values <- seq(from = 0.53, to = 0.84, length.out = 10)
simulate_beta_binomial(rho_values)
##
## Rho = 0.53 :
## Mean: 18.62637
## Variance: 73.9033
##
## Rho = 0.56 :
## Mean: 15.27473
## Variance: 87.1348
##
## Rho = 0.6 :
## Mean: 17.79121
## Variance: 77.32259
##
## Rho = 0.63 :
## Mean: 18.49451
## Variance: 78.85275
##
## Rho = 0.67 :
## Mean: 18.64835
## Variance: 75.23053
##
## Rho = 0.7 :
## Mean: 18.45055
## Variance: 82.67253
##
## Rho = 0.74 :
## Mean: 17.48352
## Variance: 93.31917
##
## Rho = 0.77 :
## Mean: 15.92308
## Variance: 122.5607
##
## Rho = 0.81 :
## Mean: 16.65934
## Variance: 110.8049
##
## Rho = 0.84 :
## Mean: 16.79121
## Variance: 119.7448
# Try 0.77 to 0.9? Honestly, none of these look especially close to the observed distribution.
# Final rho sweep: [0.77, 0.9].
rho_values <- seq(from = 0.77, to = 0.9, length.out = 10)
simulate_beta_binomial(rho_values)
##
## Rho = 0.77 :
## Mean: 16.12088
## Variance: 107.8408
##
## Rho = 0.78 :
## Mean: 16.35165
## Variance: 109.4305
##
## Rho = 0.8 :
## Mean: 16.7033
## Variance: 114.3888
##
## Rho = 0.81 :
## Mean: 18.74725
## Variance: 98.36874
##
## Rho = 0.83 :
## Mean: 15.53846
## Variance: 123.4735
##
## Rho = 0.84 :
## Mean: 17.52747
## Variance: 111.1631
##
## Rho = 0.86 :
## Mean: 18.92308
## Variance: 100.1829
##
## Rho = 0.87 :
## Mean: 17.96703
## Variance: 112.5656
##
## Rho = 0.89 :
## Mean: 18.79121
## Variance: 100.6337
##
## Rho = 0.9 :
## Mean: 18.42857
## Variance: 106.5143
# If forced to pick one, rho = 0.67 — but again, in my opinion none of these appropriately mirror the observed distribution.
#5. histogram of yellow tang counts
# count data: many zeros and a long right tail (max ~50)
discrete.histogram(yellow_tang$count)
#6. mean var of count - mean = 5.03 - var = 35.97
mean(yellow_tang$count)
## [1] 5.026948
# variance is ~7x the mean -- strong overdispersion relative to Poisson
var(yellow_tang$count)
## [1] 35.96793
# Expected variance under a Poisson distribution: variance = mean, so var would be 5.03. The observed variance (35.97) is far higher — the data appear zero-inflated/overdispersed, so a Poisson fit is poor.
#7. simulate five sets of random draws from Poisson dist - use same looping strategy as above
# Match the observed sample size and mean count, then simulate 5
# independent Poisson draws, plotting and summarising each.
n <- nrow(yellow_tang)
lambda <- mean(yellow_tang$count)
set.seed(123)
for (draw in seq_len(5)) {
  sim_counts <- rpois(n, lambda)
  # discrete histogram of this draw
  discrete.histogram(sim_counts)
  # summary statistics for this draw
  draw_mean <- mean(sim_counts)
  draw_var <- var(sim_counts)
  # report
  cat("\nIteration", draw, ":\n")
  cat("Mean:", draw_mean, "\n")
  cat("Variance:", draw_var, "\n\n")
  Sys.sleep(1)
}
##
## Iteration 1 :
## Mean: 4.962127
## Variance: 4.800314
##
## Iteration 2 :
## Mean: 5.067007
## Variance: 5.346819
##
## Iteration 3 :
## Mean: 5.064093
## Variance: 5.011924
##
## Iteration 4 :
## Mean: 4.92571
## Variance: 4.707305
##
## Iteration 5 :
## Mean: 5.059723
## Variance: 4.882728
# How are these distributions different? The simulated distributions more closely resemble a normal distribution (perhaps because n is so high). In the simulations we lose the abundance of 0/1/2-count observations and the tail of very large counts (up to 52!); the simulated draws look roughly normal, with a slight upper tail extending only to about 16.
#8. trial and error parameter fitting from negative binomial dist - can model counts with more variability than Poisson dist. - use same strategy as above
n <- nrow(yellow_tang)
mu <- mean(yellow_tang$count)
set.seed(123)
# FIX: comment previously said "from 0 to 1000", but the sweep starts at 0.1
theta_values <- seq(0.1, 1000, length.out = 10) # generate 10 theta values from 0.1 to 1000
# Simulate negative-binomial draws for each dispersion value theta, plotting
# a discrete histogram and printing the sample mean and variance.
# Small theta => strong overdispersion (var = mu + mu^2/theta); as theta
# grows the distribution approaches Poisson.
# Generalized so the hard-coded globals are now overridable defaults
# (backward compatible -- existing positional calls are unchanged):
#
#   theta_values: dispersion parameters to sweep.
#   n_draws:      number of simulated surveys (defaults to global n).
#   mean_count:   target mean count (defaults to global mu).
#   pause:        seconds to wait between plots so each can be inspected.
simulate_neg_binomial <- function(theta_values, n_draws = n,
                                  mean_count = mu, pause = 1) {
  for (theta in theta_values) {
    beta_random_fish <- rnegbin(n_draws, mean_count, theta)
    # plot discrete histogram
    discrete.histogram(beta_random_fish, xlab = "Count",
                       main = paste("Histogram for theta =", round(theta, 2)))
    # calculate mean and variance
    mean_value <- mean(beta_random_fish)
    var_value <- var(beta_random_fish)
    # print results
    cat("\nTheta =", round(theta, 2), ":\n")
    cat("Mean:", mean_value, "\n")
    cat("Variance:", var_value, "\n\n")
    Sys.sleep(pause)
  }
}
# start with .1 - 1000
simulate_neg_binomial(theta_values)
##
## Theta = 0.1 :
## Mean: 5.01311
## Variance: 226.4998
##
## Theta = 111.2 :
## Mean: 5.002913
## Variance: 4.84693
##
## Theta = 222.3 :
## Mean: 4.941005
## Variance: 4.8471
##
## Theta = 333.4 :
## Mean: 4.941005
## Variance: 5.132814
##
## Theta = 444.5 :
## Mean: 5.010197
## Variance: 5.052374
##
## Theta = 555.6 :
## Mean: 4.922797
## Variance: 5.020274
##
## Theta = 666.7 :
## Mean: 4.958485
## Variance: 5.172474
##
## Theta = 777.8 :
## Mean: 5.038602
## Variance: 5.147926
##
## Theta = 888.9 :
## Mean: 5.003642
## Variance: 4.493427
##
## Theta = 1000 :
## Mean: 5.060452
## Variance: 5.091824
# Okay — clearly theta needs to be a very small number, good to know! Run again from 0.01 to 1 and see how that goes.
# Zoom the theta sweep in on [0.01, 1].
theta_values <- seq(from = 0.01, to = 1, length.out = 10)
simulate_neg_binomial(theta_values)
##
## Theta = 0.01 :
## Mean: 8.784414
## Variance: 6155.331
##
## Theta = 0.12 :
## Mean: 5.927167
## Variance: 307.6754
##
## Theta = 0.23 :
## Mean: 5.00874
## Variance: 111.2288
##
## Theta = 0.34 :
## Mean: 4.959213
## Variance: 80.57559
##
## Theta = 0.45 :
## Mean: 5.086672
## Variance: 64.22645
##
## Theta = 0.56 :
## Mean: 4.855062
## Variance: 44.3937
##
## Theta = 0.67 :
## Mean: 5.101238
## Variance: 43.19018
##
## Theta = 0.78 :
## Mean: 5.121631
## Variance: 33.45385
##
## Theta = 0.89 :
## Mean: 5.241806
## Variance: 37.11204
##
## Theta = 1 :
## Mean: 5.057538
## Variance: 28.24523
# Super cool — theta = 0.78 seems to approximate the observed distribution closely. Refine further.
# Refine the theta sweep to [0.6, 0.9].
theta_values <- seq(from = 0.6, to = 0.9, length.out = 10)
simulate_neg_binomial(theta_values)
##
## Theta = 0.6 :
## Mean: 4.93445
## Variance: 45.50153
##
## Theta = 0.63 :
## Mean: 4.9563
## Variance: 43.8873
##
## Theta = 0.67 :
## Mean: 4.92571
## Variance: 41.01197
##
## Theta = 0.7 :
## Mean: 4.996358
## Variance: 40.25728
##
## Theta = 0.73 :
## Mean: 5.28405
## Variance: 44.42946
##
## Theta = 0.77 :
## Mean: 5.13984
## Variance: 39.53291
##
## Theta = 0.8 :
## Mean: 5.182083
## Variance: 40.97848
##
## Theta = 0.83 :
## Mean: 4.79898
## Variance: 30.54265
##
## Theta = 0.87 :
## Mean: 5.350328
## Variance: 37.14759
##
## Theta = 0.9 :
## Mean: 5.040787
## Variance: 33.86277
# Final choice: theta = 0.73. It appropriately models the abundance of low-count observations and includes a tail of high-count values close to the observed maximum of ~50.